In this project music clips are classified into two genres (Fast and Slow). A playlist of 60 songs (30 of each genre) has been created, and three 30-second clips at different time locations have been taken from each song.
Features — namely MFCCs, zero crossing rate, spectral centroid, spectral bandwidth, spectral roll-off, and spectral contrast — are extracted from these clips. A dataset of these extracted features is used to train a machine-learning classifier, K-nearest neighbours.
Different configurations of features are used for classification to determine which features are best suited for our classification task. We have also identified and analysed incorrectly classified music clips.
import numpy as np
import os
from google.colab import drive
import IPython.display as ipd
import librosa
import scipy as sp
from scipy import signal
from scipy.fftpack import fft
from scipy.io import wavfile
!pip install pydub
from os import path
from pydub import AudioSegment
import soundfile as sf
import librosa.display
import sklearn
import pandas
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import pandas
import seaborn as sns
%matplotlib inline
# Mount Google Drive and convert every jazz (slow-genre) MP3 to a WAV file.
drive.mount('/content/gdrive')
os.chdir("/content/gdrive/Shareddrives/Speech_processing_project/songs_playlist/Jazz_songs")
audio_files = os.listdir()
# Iterate directly over the directory listing; the file count is not needed.
for file in audio_files:
    # split the file name into its base name and extension
    name, ext = os.path.splitext(file)
    if ext == ".mp3":
        mp3_sound = AudioSegment.from_mp3(file)
        # export under the old base name with a ".wav" extension
        mp3_sound.export("/content/gdrive/Shareddrives/Speech_processing_project/songs_playlist/slow_wav/{0}.wav".format(name), format="wav")
# Convert every rock (fast-genre) MP3 to a WAV file, mirroring the jazz step.
os.chdir("/content/gdrive/Shareddrives/Speech_processing_project/songs_playlist/rock_songs")
audio_files = os.listdir()
# Iterate directly over the directory listing; the file count is not needed.
for file in audio_files:
    # split the file name into its base name and extension
    name, ext = os.path.splitext(file)
    if ext == ".mp3":
        mp3_sound = AudioSegment.from_mp3(file)
        # export under the old base name with a ".wav" extension
        mp3_sound.export("/content/gdrive/Shareddrives/Speech_processing_project/songs_playlist/fast_wav/{0}.wav".format(name), format="wav")
# File listings of the converted full-length WAV songs, one list per genre.
#..............List of slow songs......................................
list_slow = os.listdir("/content/gdrive/Shareddrives/Speech_processing_project/songs_playlist/slow_wav/")
#..............List of fast songs......................................
list_fast=os.listdir("/content/gdrive/Shareddrives/Speech_processing_project/songs_playlist/fast_wav/")
# Cut three 30-second clips from random positions of each of the 30 fast songs
# and save them as frames/fast/fast.<n>.wav.
import random

fs = 22050                      # librosa's default resampling rate (Hz)
frame = np.zeros(30 * fs)
clip_no = 0                     # running counter used in the output file names
path_1 = "/content/gdrive/Shareddrives/Speech_processing_project/songs_playlist/fast_wav/"
path_2 = "/content/gdrive/Shareddrives/Speech_processing_project/frames/fast/"
for i in range(0, 30):
    y, fs = librosa.load(str(path_1) + list_fast[i], mono=True)
    # use the actual sample rate returned by librosa, not a hard-coded 22050
    song_seconds = int(y.size / fs)
    for j in range(0, 3):
        clip_no += 1
        # Random clip end point, at least 30 s into the song; the max() guard
        # keeps random.randint from raising on songs shorter than 30 s.
        random_no = random.randint(30, max(30, song_seconds))
        frame = y[(random_no - 30) * fs:random_no * fs]
        sf.write(path_2 + "fast.{0}.wav".format(clip_no), frame, fs)
# Cut three 30-second clips from random positions of each of the 30 slow songs
# and save them as frames/slow/slow.<n>.wav.
clip_no = 0                     # restart the clip counter for the slow genre
path_1 = "/content/gdrive/Shareddrives/Speech_processing_project/songs_playlist/slow_wav/"
path_2 = "/content/gdrive/Shareddrives/Speech_processing_project/frames/slow/"
for i in range(0, 30):
    y, fs = librosa.load(str(path_1) + list_slow[i], mono=True)
    # use the actual sample rate returned by librosa, not a hard-coded 22050
    song_seconds = int(y.size / fs)
    for j in range(0, 3):
        clip_no += 1
        # max() guard keeps random.randint from raising on songs shorter than 30 s
        random_no = random.randint(30, max(30, song_seconds))
        frame = y[(random_no - 30) * fs:random_no * fs]
        sf.write(path_2 + "slow.{0}.wav".format(clip_no), frame, fs)
# File listings of the 30-second clip files, one list per genre.
#..............List of fast song clips.................................
list_fast = os.listdir("/content/gdrive/Shareddrives/Speech_processing_project/frames/fast/")
#..............List of slow song clips.................................
list_slow=os.listdir("/content/gdrive/Shareddrives/Speech_processing_project/frames/slow/")
pathAudio_fast = "/content/gdrive/Shareddrives/Speech_processing_project/frames/fast/"
pathAudio_slow= "/content/gdrive/Shareddrives/Speech_processing_project/frames/slow/"
# Load one example clip of each genre for visual comparison.
y_fast, fs= librosa.load(str(pathAudio_fast)+list_fast[50], mono = True)
y_slow, fs= librosa.load(str(pathAudio_slow)+list_slow[40], mono = True)
# Stacked waveform plots: fast clip on top, slow clip below.
fig=plt.figure(figsize=(12,5))
fig.subplots_adjust(hspace=.5)
plt.subplot(2, 1, 1)
plt.xlabel("Time")
plt.ylabel("Amplitude")
# NOTE(review): waveplot was removed in librosa 0.10; newer versions use waveshow
librosa.display.waveplot(y_fast,fs)
plt.title('new/fast')
plt.subplot(2, 1, 2)
plt.xlabel("Time")
plt.ylabel("Amplitude")
librosa.display.waveplot(y_slow,fs)
plt.title('old/slow')
Fast/new songs have high amplitude compared to slow/old songs because fast songs are loud and harsh.
Fast/new songs have a high zero crossing rate compared to slow/old songs because slow/old songs are comparatively smooth and do not have much noisy high-frequency content.
# Notebook playback widgets for the two example clips.
ipd.Audio(y_fast, rate=fs)
ipd.Audio(y_slow, rate=fs)
The mel spectrogram remaps the values in hertz to the mel scale according to the following conversion formula:
$ m=2595\log_{10}(1+\frac{f}{700}) $
and its inverse: $ f=700(10^{\frac{m}{2595}}-1) $
# Log-mel spectrograms of the fast and slow example clips, stacked vertically.
window_size = 2048
fig = plt.figure(figsize=(12, 5))
fig.subplots_adjust(hspace=.5)
plt.subplot(2, 1, 1)
# keyword argument y=: librosa >= 0.10 no longer accepts the audio positionally
S = librosa.feature.melspectrogram(y=y_fast, sr=22050, n_fft=window_size)
logS = librosa.power_to_db(S)   # convert power to decibels for display
plt.title('Fast/New')
librosa.display.specshow(logS, sr=22050, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.subplot(2, 1, 2)
S2 = librosa.feature.melspectrogram(y=y_slow, sr=22050, n_fft=window_size)
logS2 = librosa.power_to_db(S2)
plt.title('Slow/Old')
librosa.display.specshow(logS2, sr=22050, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
The spectrum of fast/new songs is spread over a wide frequency range, as fast/new songs use many instruments and each instrument corresponds to different frequencies.
Slow/old songs use fewer instruments; therefore we can see only some of the frequency components.
def get_subdirectories(path):
    """Return the names of the immediate subdirectories of *path*."""
    subdirs = []
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            subdirs.append(entry)
    return subdirs
def get_sample_arrays(path, song_name, samp_rate):
    """Load every audio file under path + song_name as a mono sample array.

    Parameters
    ----------
    path : str
        Directory containing one sub-folder per genre.
    song_name : str
        Genre sub-folder name (e.g. 'fast' or 'slow').
    samp_rate : int
        Target sampling rate passed to librosa.load; each file is truncated
        to at most 30 seconds.

    Returns
    -------
    np.ndarray stacking one row per loaded clip.
    """
    # BUGFIX: the original referenced the global `song` from the calling loop
    # instead of the `song_name` parameter, so the function only worked by
    # coincidence of the caller's loop-variable name.
    path_of_audios = librosa.util.find_files(path + song_name)
    audios = []
    for audio in path_of_audios:
        x, sr = librosa.load(audio, sr=samp_rate, duration=30.0)
        audios.append(x)
    return np.array(audios)
def extract_features(signal, sample_rate, frame_size, hop_size):
    """Compute a 36-element feature vector for one audio clip.

    The vector is the (mean, std) of each of: zero-crossing rate, spectral
    centroid, spectral contrast, spectral bandwidth, spectral roll-off,
    followed by the (mean, std) of MFCC coefficients 1..13 — the same order
    as the original hand-expanded version.

    Parameters
    ----------
    signal : np.ndarray       mono audio samples
    sample_rate : int         sampling rate of `signal` in Hz
    frame_size : int          analysis window length (n_fft / frame_length)
    hop_size : int            hop between analysis frames

    Returns
    -------
    list of 36 floats.
    """
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=signal, frame_length=frame_size, hop_length=hop_size)
    spectral_centroid = librosa.feature.spectral_centroid(y=signal, sr=sample_rate, n_fft=frame_size,
                                                          hop_length=hop_size)
    spectral_contrast = librosa.feature.spectral_contrast(y=signal, sr=sample_rate, n_fft=frame_size,
                                                          hop_length=hop_size)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=signal, sr=sample_rate, n_fft=frame_size,
                                                            hop_length=hop_size)
    spectral_rolloff = librosa.feature.spectral_rolloff(y=signal, sr=sample_rate, n_fft=frame_size, hop_length=hop_size)
    mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_fft=frame_size, hop_length=hop_size)
    features = []
    for feature_matrix in (zero_crossing_rate, spectral_centroid, spectral_contrast,
                           spectral_bandwidth, spectral_rolloff):
        features.append(np.mean(feature_matrix))
        features.append(np.std(feature_matrix))
    # Coefficient 0 mostly reflects overall energy, so — as in the original
    # hand-written expansion — only MFCC coefficients 1..13 are kept.
    for coeff in range(1, 14):
        features.append(np.mean(mfccs[coeff, :]))
        features.append(np.std(mfccs[coeff, :]))
    return features
def csv(data_set):
    """Split a labelled dataframe into features X and target y ('genre')."""
    target = data_set.genre
    features = data_set.drop(columns=['genre'])
    return features, target
def data_split(data_set):
    """Return a random 60/40 train/test split of the dataset.

    Returns (X_train, X_test, y_train, y_test).
    """
    features, labels = csv(data_set)
    return train_test_split(features, labels, test_size=0.40)
def model(data_set):
    """Sweep k = 1..39 for a KNN classifier, plot test accuracy vs k,
    then retrain with the best-scoring k.

    Returns (X_test, y_test, knn) where `knn` is fitted with the best k.
    """
    X_train,X_test,y_train,y_test=data_split(data_set)
    #..............Applying KNN model.........................................
    results_knn=[]
    #............iterations for obtaining best k value........................
    # NOTE(review): k is selected on the test split itself, so the reported
    # "test" accuracy is optimistically biased; a validation split would be
    # cleaner.
    for i in range(1,40):
        knn=KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train,y_train)
        results_knn.append(knn.score(X_test,y_test))
    max_accuracy_knn=max(results_knn)
    # BUGFIX: results_knn[0] corresponds to k=1, so the best k is index + 1.
    # The original used the raw list index, retraining with the wrong k and
    # crashing with n_neighbors=0 whenever k=1 scored best.
    best_k=results_knn.index(max_accuracy_knn)+1
    print("Max Accuracy is {:.3f} on test dataset with {} neighbors.\n".format(max_accuracy_knn,best_k))
    plt.plot(np.arange(1,40),results_knn)
    plt.xlabel("n Neighbors")
    plt.ylabel("Test Accuracy")
    plt.grid()
    #..............Training model with best k value...........................
    knn=KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(X_train,y_train)
    print("Training Score: {:.3f}".format(knn.score(X_train,y_train)))
    print("Test score: {:.3f}".format(knn.score(X_test,y_test)))
    return X_test,y_test,knn
def confusion_mat(X_test, y_test, knn):
    """Plot the confusion matrix of *knn* on the test set; return predictions."""
    predictions = knn.predict(X_test)
    matrix = confusion_matrix(y_test, predictions)
    axis = plt.subplot()
    sns.heatmap(matrix, annot=True, fmt='g', ax=axis)
    axis.set_title('Confusion Matrix')
    axis.set_xlabel('Predicted labels')
    axis.set_ylabel('True labels')
    axis.xaxis.set_ticklabels(['fast', 'slow'])
    axis.yaxis.set_ticklabels(['fast', 'slow'])
    return predictions
#......Configuration 1: ZCR + spectral features only (first 10 values)......
path='/content/gdrive/Shareddrives/Speech_processing_project/frames/'
song_list=get_subdirectories(path)   # genre sub-folders, e.g. ['fast', 'slow']
labels=[]
fs=22050
frame_size=2048
hop_size=512
is_created = False
for song in song_list:
    sample_arrays = get_sample_arrays(path, song, samp_rate=22050)
    for sample_array in sample_arrays:
        row= extract_features(sample_array, fs,frame_size, hop_size)
        # keep only the ZCR + spectral statistics; drop the 26 MFCC values
        row=row[:10]
        if not is_created:
            dataset_numpy = np.array(row)   # first row seeds the matrix
            is_created = True
        elif is_created:
            dataset_numpy = np.vstack((dataset_numpy, row))
        labels.append(song)   # genre label = name of the sub-folder
#..................Standardizing the variables..........................
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))
dataset_numpy = scaler.fit_transform(dataset_numpy)
Feature_Names = ['meanZCR', 'stdZCR', 'meanSpecCentroid', 'stdSpecCentroid', 'meanSpecContrast', 'stdSpecContrast',
                 'meanSpecBandwidth', 'stdSpecBandwidth', 'meanSpecRollof', 'stdSpecRollof',]
# Round-trip through CSV so the dataset is persisted alongside the notebook.
dataset_pandas = pandas.DataFrame(dataset_numpy, columns=Feature_Names)
dataset_pandas["genre"] = labels
dataset_pandas.to_csv("data_set.csv", index=False)
data_set = pandas.read_csv('data_set.csv',index_col=False)
genre=['fast','slow']
sns.pairplot(data_set,hue='genre')   # pairwise feature scatter, coloured by genre
The diagonal plots show the distribution of the given features with respect to the two genres.
The plot of meanZCR verifies that slow/old songs have a low zero crossing rate compared to fast/new songs; therefore slow/old songs are comparatively smoother.
The plot of meanSpecCentroid verifies that fast/new songs have a higher spectral centroid than slow/old songs, which signifies that fast/new songs are brighter, with more of the signal's energy concentrated in higher frequencies.
The plot of meanSpecContrast verifies that slow/old songs have higher spectral contrast than fast/new songs, which signifies that slow/old songs are relatively clearer, more narrow-band signals than fast/new songs.
The plot of meanSpecBandwidth verifies that fast/new songs have more spectral bandwidth than slow/old songs.
Spectral roll-off indicates the frequency below which a specified percentage of the total spectral energy (nearly 85%) lies.
The plot of meanSpecRolloff verifies that fast/new songs have a relatively higher spectral roll-off frequency than slow/old songs, which signifies that the spectral energy of fast/new songs is spread over a larger frequency range than that of slow/old songs.
# Inspect a few rows (presumably around the fast/slow boundary — 90 clips per
# genre — TODO confirm), check class balance, then train and evaluate.
data_set[85:95].style
sns.countplot(data_set['genre'])
X_test,y_test,knn=model(data_set)
y_pred=confusion_mat(X_test,y_test,knn)
#......Configuration 2: lower-order MFCCs only (coefficients 1-6)......
path='/content/gdrive/Shareddrives/Speech_processing_project/frames/'
song_list=get_subdirectories(path)
labels=[]
# fs is assumed to still be 22050 from the earlier cell — TODO confirm
frame_size=2048
hop_size=512
is_created = False
for song in song_list:
    sample_arrays = get_sample_arrays(path, song, samp_rate=22050)
    for sample_array in sample_arrays:
        row= extract_features(sample_array, fs,frame_size, hop_size)
        row=row[10:22]   # mean/std pairs of MFCC coefficients 1-6
        if not is_created:
            dataset_numpy1 = np.array(row)
            is_created = True
        elif is_created:
            dataset_numpy1 = np.vstack((dataset_numpy1, row))
        labels.append(song)
#..................Standardizing the variables..........................
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))
dataset_numpy1 = scaler.fit_transform(dataset_numpy1)
Feature_Names = ['meanMFCC_1', 'stdMFCC_1', 'meanMFCC_2',
                 'stdMFCC_2', 'meanMFCC_3', 'stdMFCC_3','meanMFCC_4', 'stdMFCC_4',
                 'meanMFCC_5', 'stdMFCC_5', 'meanMFCC_6', 'stdMFCC_6',]
dataset_pandas = pandas.DataFrame(dataset_numpy1, columns=Feature_Names)
dataset_pandas["genre"] = labels
# NOTE(review): this overwrites the configuration-1 data_set.csv on disk.
dataset_pandas.to_csv("data_set.csv", index=False)
data_set1=pandas.read_csv('data_set.csv',index_col=False)
GENRES=['fast','slow']
sns.pairplot(data_set1,hue='genre')
The lower order coefficients contain most of the information about the overall spectral shape of the source-filter transfer function.
Higher order coefficients represent increasing levels of spectral details.
# Train and evaluate KNN on the lower-order MFCC configuration.
data_set1[85:95].style
X_test,y_test,knn=model(data_set1)
confusion_mat(X_test,y_test,knn)
#......Configuration 3: higher-order MFCCs only (coefficients 7-13)......
path='/content/gdrive/Shareddrives/Speech_processing_project/frames/'
song_list=get_subdirectories(path)
labels=[]
# fs is assumed to still be 22050 from the earlier cell — TODO confirm
frame_size=2048
hop_size=512
is_created = False
for song in song_list:
    sample_arrays = get_sample_arrays(path, song, samp_rate=22050)
    for sample_array in sample_arrays:
        row= extract_features(sample_array, fs,frame_size, hop_size)
        row=row[22:36]   # mean/std pairs of MFCC coefficients 7-13
        if not is_created:
            dataset_numpy2 = np.array(row)
            is_created = True
        elif is_created:
            dataset_numpy2 = np.vstack((dataset_numpy2, row))
        labels.append(song)
#..................Standardizing the variables..........................
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))
dataset_numpy2 = scaler.fit_transform(dataset_numpy2)
Feature_Names =['meanMFCC_7', 'stdMFCC_7', 'meanMFCC_8', 'stdMFCC_8', 'meanMFCC_9', 'stdMFCC_9',
                'meanMFCC_10', 'stdMFCC_10', 'meanMFCC_11', 'stdMFCC_11', 'meanMFCC_12', 'stdMFCC_12',
                'meanMFCC_13', 'stdMFCC_13']
dataset_pandas = pandas.DataFrame(dataset_numpy2, columns=Feature_Names)
dataset_pandas["genre"] = labels
# NOTE(review): again overwrites data_set.csv on disk.
dataset_pandas.to_csv("data_set.csv", index=False)
data_set2=pandas.read_csv('data_set.csv',index_col=False)
GENRES=['fast','slow']
sns.pairplot(data_set2,hue='genre')
#......Configuration 4: all 36 features (ZCR, spectral stats and MFCC 1-13)......
path='/content/gdrive/Shareddrives/Speech_processing_project/frames/'
song_list=get_subdirectories(path)
labels=[]
# fs is assumed to still be 22050 from the earlier cell — TODO confirm
frame_size=2048
hop_size=512
is_created = False
for song in song_list:
    sample_arrays = get_sample_arrays(path, song, samp_rate=22050)
    for sample_array in sample_arrays:
        # full 36-value feature vector, no slicing this time
        row= extract_features(sample_array, fs,frame_size, hop_size)
        if not is_created:
            dataset_numpy3 = np.array(row)
            is_created = True
        elif is_created:
            dataset_numpy3 = np.vstack((dataset_numpy3, row))
        labels.append(song)
#..................Standardizing the variables..........................
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))
dataset_numpy3 = scaler.fit_transform(dataset_numpy3)
Feature_Names = ['meanZCR', 'stdZCR', 'meanSpecCentroid', 'stdSpecCentroid', 'meanSpecContrast', 'stdSpecContrast',
                 'meanSpecBandwidth', 'stdSpecBandwidth', 'meanSpecRollof', 'stdSpecRollof',
                 'meanMFCC_1', 'stdMFCC_1', 'meanMFCC_2', 'stdMFCC_2', 'meanMFCC_3', 'stdMFCC_3',
                 'meanMFCC_4', 'stdMFCC_4', 'meanMFCC_5', 'stdMFCC_5', 'meanMFCC_6', 'stdMFCC_6',
                 'meanMFCC_7', 'stdMFCC_7', 'meanMFCC_8', 'stdMFCC_8', 'meanMFCC_9', 'stdMFCC_9',
                 'meanMFCC_10', 'stdMFCC_10', 'meanMFCC_11', 'stdMFCC_11', 'meanMFCC_12', 'stdMFCC_12',
                 'meanMFCC_13', 'stdMFCC_13'
                 ]
dataset_pandas = pandas.DataFrame(dataset_numpy3, columns=Feature_Names)
dataset_pandas["genre"] = labels
dataset_pandas.to_csv("data_set.csv", index=False)
data_set3=pandas.read_csv('data_set.csv',index_col=False)
GENRES=['fast','slow']
data_set3[85:95].style
X_test,y_test,knn=model(data_set3)
y_pred=confusion_mat(X_test,y_test,knn)
#.........Locate and report the incorrectly classified test clips.........
# The original first copied y_pred into Y_pred element by element, then
# immediately overwrote it with np.array(y_pred) — the copy loop was dead
# code (and Y_test was never filled before its own conversion). Converting
# directly produces the same arrays.
Y_pred=np.array(y_pred)
Y_test=np.array(y_test)
for i in range(len(Y_pred)):
    if Y_pred[i]!=Y_test[i]:
        # position in the test split, plus predicted vs. true genre
        print(i)
        print('Predicted label:',Y_pred[i])
        print('Actual label:',Y_test[i])
# Feature rows of the four misclassified clips (positions found in the run above).
print(X_test[9:10]);print(X_test[37:38]);print(X_test[56:57]);print(X_test[57:58])
Total music clips=180
Slow music clips= 90
Fast music clips= 90
For our dataset,
When incorrectly classified index is greater than or equal to 90, we get the actual clip index of the wrong classified clip as-
Actual clip index = Index - 90 - 1
When incorrectly classified index is less than 90, we get the actual clip index of the wrong classified clip as-
Actual clip index = Index - 1
here, we got first incorrectly classified index as 153, Actual clip index = (153-90-1) = 62
similarly,
(103-90-1) = 12
(141-90-1) = 50
(134-90-1) = 43
# Listen to the four misclassified slow clips identified above
# (clip indices 62, 12, 50 and 43 in the slow-frames folder).
pathAudio_fast = "/content/gdrive/Shareddrives/Speech_processing_project/frames/fast/"
pathAudio_slow= "/content/gdrive/Shareddrives/Speech_processing_project/frames/slow/"
# y_fast, fs= librosa.load(str(pathAudio_fast)+list_fast[50], mono = True)
y_slow, fs= librosa.load(str(pathAudio_slow)+list_slow[62], mono = True)
ipd.Audio(y_slow, rate=fs)
pathAudio_slow= "/content/gdrive/Shareddrives/Speech_processing_project/frames/slow/"
# y_fast, fs= librosa.load(str(pathAudio_fast)+list_fast[50], mono = True)
y_slow, fs= librosa.load(str(pathAudio_slow)+list_slow[12], mono = True)
ipd.Audio(y_slow, rate=fs)
pathAudio_slow= "/content/gdrive/Shareddrives/Speech_processing_project/frames/slow/"
# y_fast, fs= librosa.load(str(pathAudio_fast)+list_fast[50], mono = True)
y_slow, fs= librosa.load(str(pathAudio_slow)+list_slow[50], mono = True)
ipd.Audio(y_slow, rate=fs)
pathAudio_slow= "/content/gdrive/Shareddrives/Speech_processing_project/frames/slow/"
# y_fast, fs= librosa.load(str(pathAudio_fast)+list_fast[50], mono = True)
y_slow, fs= librosa.load(str(pathAudio_slow)+list_slow[43], mono = True)
ipd.Audio(y_slow, rate=fs)
We have taken random 30-second music clips from Bollywood songs, and Bollywood songs are known for using a wide variety of accompaniments within a single song.
After playing the wrongly classified clips above, we observed that these particular clips use certain instruments that may not appear in the rest of the song; these instruments may be misleading our model during classification, which can explain the wrongly classified clips above.
We have performed this analysis only for the case where all the features are used together for classification, because we obtain the maximum accuracy in that case.
Four configurations are as follows:-
Classification using Zero crossing rate, spectral centroid, spectral contrast, spectral bandwidth, spectral roll-off.
Classification using lower 6 MFCC coefficients.
From the above 4 configurations we obtain comparatively better accuracy for the first and fourth configurations.
From this we can conclude that for music genre classification the features used in the first configuration are more important than the MFCC coefficients.
Also, from the diagonal plots we can observe that the features used in the first configuration have well-separated distributions for each genre.
As far as the most important features are concerned, it is evident from the diagonal plots that meanSpecContrast, meanSpecBandwidth and meanSpecRollof are the most important features.
%%shell
jupyter nbconvert --to html /content/Spech_project.ipynb